Select gene expression predictors of hypomethylation

Get A and B compartment CpGs

library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(foreach)
library(ggplot2)
library(readr)

# Get CpG locations
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
## Loading required package: minfi
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: bumphunter
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.4    2020-03-24
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
#data(IlluminaHumanMethylation450kanno.ilmn12.hg19)
data(Locations)
cpg_gr <- GRanges(Locations$chr, IRanges(Locations$pos, Locations$pos))
mcols(cpg_gr)$cg <- rownames(Locations)
cpg_gr
## GRanges object with 485512 ranges and 1 metadata column:
##            seqnames    ranges strand |              cg
##               <Rle> <IRanges>  <Rle> |     <character>
##        [1]     chrY   9363356      * |      cg00050873
##        [2]     chrY  21239348      * |      cg00212031
##        [3]     chrY   8148233      * |      cg00213748
##        [4]     chrY  15815688      * |      cg00214611
##        [5]     chrY   9385539      * |      cg00455876
##        ...      ...       ...    ... .             ...
##   [485508]    chr22  46114168      * |   ch.22.909671F
##   [485509]    chr22  48451677      * | ch.22.46830341F
##   [485510]    chr22  48731367      * |  ch.22.1008279F
##   [485511]    chr22  49193714      * | ch.22.47579720R
##   [485512]    chr22  49888838      * | ch.22.48274842R
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
# Get CGI and colon A/B compartment annotation
a <- import("../input/compartment_A.bed")
b <- import("../input/compartment_B.bed")

cpg_gr$compartment <- NA
cpg_gr$compartment[countOverlaps(cpg_gr, a)>0] <- "A"
cpg_gr$compartment[countOverlaps(cpg_gr, b)>0] <- "B"
table(cpg_gr$compartment)
## 
##      A      B 
## 285654  86328
cgi <- readRDS("../input/cpgIslandExt_hg19.rds")
left_shore <- flank(cgi, width=2000, start=TRUE, ignore.strand=TRUE)
right_shore <- flank(cgi, width=2000, start=FALSE, ignore.strand=TRUE)
cgi_shore <- reduce(c(cgi, left_shore, right_shore))
cpg_gr$opensea <- ifelse(countOverlaps(cpg_gr, cgi_shore)>0, FALSE, TRUE)

table(cpg_gr$compartment, cpg_gr$opensea)
##    
##      FALSE   TRUE
##   A 172797 112857
##   B  33749  52579

Prepare TCGA clinical, methylation and expression data

library(UCSCXenaTools)
## =========================================================================================
## UCSCXenaTools version 1.4.3
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
## 
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
##   from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
##   Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
##                               --Enjoy it--
## 
## Attaching package: 'UCSCXenaTools'
## The following object is masked from 'package:Biobase':
## 
##     samples
cohort <- "COAD\\."

# Get Clincal data
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>% 
  XenaFilter(filterDatasets = cohort) %>%
  XenaFilter(filterDatasets = "clinicalMatrix") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/COAD_clinicalMatrix
clin = XenaPrepare(xe_download)

# Get 450k DNAm matrix
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>% 
  XenaFilter(filterDatasets = cohort) %>%
  XenaFilter(filterDatasets = "HumanMethylation450") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/HumanMethylation450.gz
dnam_df = XenaPrepare(xe_download)
rn <- dnam_df$sample
dnam <- dnam_df %>% select(-1) %>% as.matrix()
rownames(dnam) <- rn

# Get gene expression matrix
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>% 
  XenaFilter(filterDatasets = cohort) %>% 
  XenaFilter(filterDatasets = "HiSeqV2_PANCAN") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/HiSeqV2_PANCAN.gz
expr_df <- XenaPrepare(xe_download)
rn <- expr_df$sample
expr <- expr_df %>% select(-1) %>% as.matrix()
rownames(expr) <- rn

Estimate A and B compartment open sea methylation

# a_os = A, Open-Sea
table(rownames(dnam) %in% cpg_gr$cg)
## 
##  FALSE   TRUE 
##     65 485512
a_os_idx <- which(rownames(dnam) %in% cpg_gr$cg[cpg_gr$compartment=="A" & cpg_gr$opensea])
b_os_idx <- which(rownames(dnam) %in% cpg_gr$cg[cpg_gr$compartment=="B" & cpg_gr$opensea])
hist(dnam[a_os_idx,1])

hist(dnam[b_os_idx,1])

a_os <- colMeans(dnam[a_os_idx,], na.rm=TRUE)
b_os <- colMeans(dnam[b_os_idx,], na.rm=TRUE)

hypometh <- a_os - b_os

plot(a_os, b_os)
abline(0, 1, col="red")

plot(a_os, hypometh)

keep <- intersect(colnames(expr), names(hypometh))
expr <- expr[,keep]
hypometh <- hypometh[keep]

idx <- which(rowVars(expr)>0.1)
df <- foreach (i = idx, .combine=rbind) %do% {
  coeff <- coef(summary(lm(hypometh ~ expr[i,])))[2,]
  data.frame(gene=rownames(expr)[i], estimate=coeff["Estimate"], p=coeff["Pr(>|t|)"])
}

df %>% ggplot(aes(estimate, -log10(p))) + geom_point() + geom_vline(xintercept = c(-0.04, 0.04), color="red") + geom_hline(yintercept = 10, color="red") + theme_bw()

Select significantly upregulated with hypomethylation genes

hypometh_up_genes <- df %>% filter(estimate>0.04 & -log10(p)>10) %>% pull(gene)
write_tsv(data.frame(symbol=hypometh_up_genes), file="../coad_hypometh_up_genes.txt", col_names=FALSE)
for (g in hypometh_up_genes[1:10]) {
  plot(expr[g,], hypometh, main=g)
}

hypometh_up_score <- colMeans(expr[hypometh_up_genes,])
summary(lm(hypometh ~ hypometh_up_score))
## 
## Call:
## lm(formula = hypometh ~ hypometh_up_score)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.090838 -0.025397 -0.002691  0.021198  0.155253 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.04001    0.00295   13.56   <2e-16 ***
## hypometh_up_score  0.09950    0.00721   13.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03748 on 297 degrees of freedom
## Multiple R-squared:  0.3907, Adjusted R-squared:  0.3887 
## F-statistic: 190.5 on 1 and 297 DF,  p-value: < 2.2e-16
plot(hypometh_up_score, hypometh, main="hypometh_up_score")

Examine hypometh down published genes

s5 <- read_tsv("../input/2020_johnstone_reyes_table_s5.txt")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Gene symbol` = col_character(),
##   Compartment = col_character()
## )
hypometh_down_b <- s5 %>% pull(`Gene symbol`)
df %>% filter(gene %in% hypometh_down_b) %>% ggplot(aes(estimate, -log10(p))) + geom_point()

hypometh_down_score <- colMeans(expr[hypometh_down_b,])
summary(lm(hypometh ~ hypometh_down_score))
## 
## Call:
## lm(formula = hypometh ~ hypometh_down_score)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.098050 -0.027497 -0.000965  0.023295  0.145629 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.028480   0.004199   6.783 6.37e-11 ***
## hypometh_down_score -0.032278   0.002881 -11.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04026 on 297 degrees of freedom
## Multiple R-squared:  0.2971, Adjusted R-squared:  0.2947 
## F-statistic: 125.5 on 1 and 297 DF,  p-value: < 2.2e-16
plot(hypometh_down_score, hypometh, main="coad_hypometh_down_score")

plot(hypometh_down_score, hypometh_up_score)

Look correlation between hypomethylation and A/B gene expression difference

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
library(org.Hs.eg.db)
## 
entrez_id <- mapIds(org.Hs.eg.db, rownames(expr), "ENTREZID", "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
symbol <- unlist(mapIds(org.Hs.eg.db, entrez_id,  "SYMBOL", "ENTREZID"))
## 'select()' returned 1:1 mapping between keys and columns
gene_pos <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, 
       keys = entrez_id[rownames(expr)], 
       columns=c("TXNAME", "TXCHROM", "TXSTART"), 
       keytype="GENEID")
## 'select()' returned many:many mapping between keys and columns
gene_pos <- gene_pos %>% filter(!is.na(TXCHROM) & !is.na(TXSTART))
gene_pos$symbol <- symbol[gene_pos$GENEID]
gene_pos <- gene_pos[!duplicated(gene_pos$symbol),]
gene_tss_gr <- GRanges(gene_pos$TXCHROM, IRanges(gene_pos$TXSTART, gene_pos$TXSTART))
mcols(gene_tss_gr) <- gene_pos[, c("GENEID", "symbol")]
gene_tss_gr$compartment <- NA
gene_tss_gr$compartment[countOverlaps(gene_tss_gr, a)>0] <- "A"
gene_tss_gr$compartment[countOverlaps(gene_tss_gr, b)>0] <- "B"

a_idx <- which(rownames(expr) %in% gene_tss_gr$symbol[gene_tss_gr$compartment=="A"])
b_idx <- which(rownames(expr) %in% gene_tss_gr$symbol[gene_tss_gr$compartment=="B"])

expr_a <- colMeans(expr[a_idx,], na.rm=TRUE)
expr_b <- colMeans(expr[b_idx,], na.rm=TRUE)
plot(expr_a, expr_b)

plot(hypometh, expr_a)

plot(hypometh, expr_b)

plot(hypometh, expr_a-expr_b)

write_tsv(data.frame(symbol=rownames(expr)[a_idx]), file="../coad_a_genes.txt", col_names=FALSE)
write_tsv(data.frame(symbol=rownames(expr)[b_idx]), file="../coad_b_genes.txt", col_names=FALSE)

Repeats - no obvious correlation using a handful of putative repeat-associated genes

library(stringr)
idx <- str_which(rownames(expr), "^L1")
rownames(expr)[idx]
## [1] "L1CAM" "L1TD1"
g <- "L1TD1"
plot(expr[g,], hypometh, main=g)

idx <- str_which(rownames(expr), "ERV")
rownames(expr)[idx]
## [1] "ERVFRDE1"
g <- "ERVFRDE1"
plot(expr[g,], hypometh, main=g)